#Replication material for: 
#When Censorship Works: 
#Exploring the Resilience of News Websites to Online Censorship
#British Journal of Political Science
#Author: Philipp M. Lutscher

#This script prepares the main datasets used in the analyses and 
#produces the descriptive findings presented in the appendix

#Remove objects
#Deletes every object returned by ls() in the current environment, so the
#script does not depend on anything left over from an earlier session
rm(list = ls())

#Measure duration
#Store the start time; the elapsed time is printed at the end of the script
start <- Sys.time()

###Load packages ####
library(tidyverse)
library(lubridate)
library(readxl)
library(imputeTS)
library(xtable)

#Load helper functions
source("helper_functions.R")

### Prepare data ####

#Read the list of sampled outlets
media_egypt_list <- read_excel("input/media_egypt_list.xlsx")

#Keep the estimated blocking date as a character string
media_egypt_list[["estimated_date_of_determining_blocking"]] <-
  as.character(media_egypt_list[["estimated_date_of_determining_blocking"]])

#Site identifier: "old/new" for outlets with a new domain name,
#otherwise simply the domain name
media_egypt_list <- media_egypt_list %>%
  mutate(site = ifelse(is.na(domain_name_new),
                       domain_name,
                       paste0(domain_name, "/", domain_name_new)))

#Split the list into outlets whose main audience is Egypt and all others
media_egypt_list_domestic <- media_egypt_list %>%
  filter(`main audience / scope` == "Egypt")
media_egypt_list_inter <- media_egypt_list %>%
  filter(`main audience / scope` != "Egypt")

#Read the semicolon-separated traffic data
sample_traffic_out <- read_delim(
  "input/sample_traffic_out.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)

#Outcome variables in the traffic data:
## Page views per million: Out of every million pageviews by all users on the internet today, how many were made to this site?
## Reach per million: Out of every million users on the internet today, how many visited this site?
## Page views per user: What is the average number of pageviews to this site, per user, per day?

#Parse the day-month-year strings into Date objects
sample_traffic_out[["date"]] <- dmy(sample_traffic_out[["date"]])

#Merge websites that changed domain name
changing_domains <- filter(media_egypt_list, !is.na(domain_name_new))

#Outcome variables that are summed across the old and the new domain
traffic_metrics <- c("page_views_per_million", "page_views_per_user",
                     "reach_per_million")

#Combine the traffic of one outlet that moved from old_domain to new_domain.
#  traffic:    traffic data with the columns site, date and traffic_metrics
#  old_domain: original domain name of the outlet
#  new_domain: new domain name of the outlet
#Returns the rows of the old domain with site set to "old/new" and each
#outcome replaced by the sum over both domains. NAs count as zero while
#summing; a sum of zero is turned back into NA afterwards.
combine_domain_traffic <- function(traffic, old_domain, new_domain) {
  old_traffic <- filter(traffic, site == old_domain)
  new_traffic <- filter(traffic, site == new_domain)

  #The series are added row by row, so both domains must cover exactly the
  #same dates in the same order. Otherwise R would recycle or mis-pair days
  #without any error.
  if (!identical(old_traffic$date, new_traffic$date)) {
    stop("Traffic for '", old_domain, "' and '", new_domain,
         "' does not cover the same dates in the same order; ",
         "the two series cannot be added row by row.", call. = FALSE)
  }

  old_traffic$site <- paste0(old_traffic$site, "/", new_traffic$site)

  for (metric in traffic_metrics) {
    old_values <- old_traffic[[metric]]
    new_values <- new_traffic[[metric]]
    old_values[is.na(old_values)] <- 0
    new_values[is.na(new_values)] <- 0

    combined <- old_values + new_values
    #Zeros are again considered as NAs for the merged websites
    combined[combined == 0] <- NA
    old_traffic[[metric]] <- combined
  }

  old_traffic
}

#One merged series per outlet. seq_len() makes this a no-op (an empty
#dataset) when no outlet changed its domain name.
merg_out <- bind_rows(lapply(seq_len(nrow(changing_domains)), function(i) {
  combine_domain_traffic(sample_traffic_out,
                         changing_domains$domain_name[i],
                         changing_domains$domain_name_new[i])
}))

#Add those websites to the traffic of outlets that never changed domain name
sample_traffic_out <- dplyr::bind_rows(
  filter(sample_traffic_out,
         site %in% filter(media_egypt_list, is.na(domain_name_new))$domain_name),
  merg_out
)

#Attach the outlet information to every traffic observation and flag the
#days on or after the estimated blocking date (1 = blocked, 0 otherwise;
#observations without a comparable blocking date are coded 0)
sample_traffic_out <- sample_traffic_out %>%
  left_join(media_egypt_list, by = "site") %>%
  mutate(blocked = case_when(
    date >= estimated_date_of_determining_blocking ~ 1,
    TRUE ~ 0
  ))

#Update NAs for traffic data via the helper fill_in_traffic(), called with 7
#(per the original note: breaks of up to 7 days are linearly imputed)
sample_traffic_final_out <- fill_in_traffic(7)

#Smallest observed value of "reach per million"
minum_value_reach <- min(sample_traffic_final_out[["reach_per_million"]],
                         na.rm = TRUE)

#Remaining NAs are set to zero. As a robustness test, reach_per_million_004
#uses the smallest observed reach instead of zero; it is created first so
#that it still sees the NAs of reach_per_million.
sample_traffic_final_out <- sample_traffic_final_out %>%
  mutate(
    reach_per_million_004 = ifelse(is.na(reach_per_million),
                                   minum_value_reach,
                                   reach_per_million),
    across(
      c(page_views_per_million, page_views_per_user, reach_per_million,
        reach_per_million3, reach_per_million14),
      function(x) ifelse(is.na(x), 0, x)
    )
  )

#Indicator: 1 if the outlet's main audience is Egypt, 0 otherwise
sample_traffic_final_out <- sample_traffic_final_out %>%
  mutate(egypt = as.numeric(`main audience / scope` == "Egypt"))

#One row per Egyptian outlet: ever blocked, stance and the Egypt indicator
sample_traffic_final_out_descriptive <- sample_traffic_final_out %>%
  group_by(site) %>%
  summarize(blocked = max(blocked),
            stance = unique(stance),
            egypt = max(egypt)) %>%
  filter(egypt == 1)

#Reported in paragraph: Descriptives (Blocked == 1)

#Stance of the outlet by blocking status
table(sample_traffic_final_out_descriptive$stance,
      sample_traffic_final_out_descriptive$blocked)

#Number of Egyptian outlets that were blocked in each year (cumulative)
sample_traffic_final_out %>%
  filter(egypt == 1) %>%
  mutate(year = year(date)) %>%
  group_by(year, site) %>%
  summarize(blocked = max(blocked)) %>%
  group_by(year) %>%
  summarize(blocked = sum(blocked))

#Create two datasets one on Egyptian outlets and one on other outlets
sample_traffic_final_out_egypt <- filter(sample_traffic_final_out,
                                         `main audience / scope` == "Egypt")
sample_traffic_final_out_other <- filter(sample_traffic_final_out,
                                         `main audience / scope` != "Egypt")

#Generate average size variable before 24 May 2017

#Average reach per million of every site over the observations flagged with
#before == 1, and a median split of the sites into two size groups.
#  traffic_data: traffic data with the columns site, before and
#                reach_per_million
#Returns one row per site with the columns site, traffic and size.
average_size_before <- function(traffic_data) {
  traffic_data %>%
    filter(before == 1) %>%
    group_by(site) %>%
    summarize(traffic = mean(reach_per_million, na.rm = TRUE)) %>%
    mutate(size = case_when(
      traffic <= median(traffic) ~ "Below median",
      traffic > median(traffic) ~ "Above median"
    ))
}

#Egyptian outlets:
sample_traffic_final_out_egypt <- mutate(
  sample_traffic_final_out_egypt,
  before = ifelse(date < as.Date("2017-05-24"), 1, 0)
)
traffic_out_final_beg <- average_size_before(sample_traffic_final_out_egypt)

#Merge with main dataset for later analysis
#(left_join() already keeps all rows of the first dataset; the merge()
#argument all.x is not a left_join() argument and is rejected by recent
#dplyr releases, so it is not passed here)
sample_traffic_final_out_egypt <- left_join(sample_traffic_final_out_egypt,
                                            traffic_out_final_beg,
                                            by = "site")

#Merge with descriptive information on sample
media_egypt_list_domestic <- left_join(media_egypt_list_domestic,
                                       traffic_out_final_beg,
                                       by = "site")

#Report Table of outlets:
print(xtable(media_egypt_list_domestic[,c(2:3,5:9,10:13,17,25)]),include.rownames = FALSE,
      type = "html",file = "output/tables/table_A1_2_sample.html")

#International outlets
sample_traffic_final_out_other <- mutate(
  sample_traffic_final_out_other,
  before = ifelse(date < as.Date("2017-05-24"), 1, 0)
)
traffic_out_final_beg <- average_size_before(sample_traffic_final_out_other)

sample_traffic_final_out_other <- left_join(sample_traffic_final_out_other,
                                            traffic_out_final_beg,
                                            by = "site")

#Print descriptives for foreign websites
media_egypt_list_inter <- left_join(media_egypt_list_inter,
                                    traffic_out_final_beg,
                                    by = "site")

print(xtable(media_egypt_list_inter[,c(2:3,5:9,10:13,17,25)]),include.rownames = FALSE,
      type = "html",file = "output/tables/table_H3_sample_international.html")


#Dataset for the traffic analysis: Egyptian outlets that were blocked and
#have an estimated blocking date
sample_egypt_censored <- sample_traffic_final_out_egypt %>%
  filter(blocking != "not blocked",
         !is.na(estimated_date_of_determining_blocking)) %>%
  mutate(
    estimated_date_of_determining_blocking =
      as.Date(estimated_date_of_determining_blocking),
    #Running variable: days relative to the estimated blocking date
    before_after_time =
      as.numeric(date - estimated_date_of_determining_blocking),
    blocked = ifelse(before_after_time >= 0, 1, 0),
    #Calendar variables and outlet characteristics as factors
    week = factor(lubridate::week(date)),
    weekday = factor(lubridate::wday(date)),
    month = factor(lubridate::month(date)),
    year = factor(lubridate::year(date)),
    stance = factor(stance),
    size = factor(size),
    site = factor(site)
  ) %>%
  #Filter dates to have running variable fully for all outlets
  filter(
    #Lower bound: days from 2017-05-24 (first date of censoring) back to
    #2017-01-01
    before_after_time >=
      as.numeric(as.Date("2017-01-01") - as.Date("2017-05-24")),
    #Upper bound: days from 2019-09-26 (last date of censoring) to 2021-03-15
    before_after_time <=
      as.numeric(as.Date("2021-03-15") - as.Date("2019-09-26"))
  )


#Appendix C: Information on filled in NAs distinguishing small vs. larger outlets
#Per site: number of observations with zero reach (zeros) and number of
#originally missing observations that did not end up as zero (nas)
sample_traffic_nas <- sample_traffic_final_out_egypt %>%
  mutate(zeros = as.numeric(reach_per_million == 0)) %>%
  group_by(site) %>%
  summarize(zeros = sum(zeros),
            nas = sum(is.na(reach_per_million_original)),
            size = unique(size),
            blocking = unique(blocking)) %>%
  mutate(nas = nas - zeros)

#Share of all observations of the outlets in one size group that fall into
#the given count column ("zeros" or "nas") of sample_traffic_nas
share_within_size <- function(count_column, size_label) {
  counted <- filter(sample_traffic_nas, size == size_label)[[count_column]]
  observations <- nrow(filter(sample_traffic_final_out_egypt,
                              size == size_label))
  sum(counted) / observations
}

#Shares in the full sample of Egyptian outlets
total_observations <- nrow(sample_traffic_final_out_egypt)
zeros <- sum(sample_traffic_nas$zeros) / total_observations
nas <- sum(sample_traffic_nas$nas) / total_observations

#Shares among outlets above the median size
zeros_above <- share_within_size("zeros", "Above median")
nas_above <- share_within_size("nas", "Above median")

#Shares among outlets below the median size
zeros_below <- share_within_size("zeros", "Below median")
nas_below <- share_within_size("nas", "Below median")

na_table1 <- cbind(nas, zeros, nas_above, zeros_above, nas_below, zeros_below)
print(xtable(na_table1, digits = 2),
      include.rownames = FALSE,
      type = "html",
      file = "output/tables/table_C1_na_full_sample.html")

#NAs for blocked outlets, from the helper get_nas_censored() called with 30
#and with 142
nas_30 <- get_nas_censored(30)
print(xtable(nas_30, digits = 2),
      include.rownames = FALSE,
      type = "html",
      file = "output/tables/table_C2_1_30days.html")

nas_142 <- get_nas_censored(142)
print(xtable(nas_142, digits = 2),
      include.rownames = FALSE,
      type = "html",
      file = "output/tables/table_C2_2_142days.html")

#Write the three main datasets to the "processed" folder
saveRDS(sample_traffic_final_out_egypt,
        file = "processed/sample_traffic_final_out_egypt.rds")
saveRDS(sample_traffic_final_out_other,
        file = "processed/sample_traffic_final_out_other.rds")
saveRDS(sample_egypt_censored,
        file = "processed/sample_egypt_censored.rds")

#Print the time elapsed since the start of the script
print(difftime(Sys.time(), start))